# Directory (relative to the working directory) that holds the gzip-compressed datasets
path_data = "./data"

# Data wrangling
import pandas as pd
import numpy as np

# Machine learning tools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression


# Interpretable AI
#!pip install lime ipython
from lime.lime_text import LimeTextExplainer
from IPython.display import HTML, display

# data_rq1_fake = pd.read_csv("rq1_fake_news.csv.gzip",sep="\t",compression="gzip")
# data_rq1_hate_speech = pd.read_csv("rq1_hate_speech.csv.gzip",sep="\t",compression="gzip")
# data_rq1_youtube = pd.read_csv("rq1_youtube.csv.gzip",sep="\t",compression="gzip")
# data_rq2_3 = pd.read_csv("rq2_3_wiki_movie_plots.csv.gzip",sep="\t",compression="gzip")
# data_rq4 = pd.read_csv("rq4_gne-release-v1.0.csv.gzip",sep="\t",compression="gzip")
# data_rq1_fake.shape, data_rq1_hate_speech.shape, data_rq1_youtube.shape, data_rq2_3.shape, data_rq4.shape

# Load the hate-speech corpus; the first column of the file is used as the row index.
df = pd.read_csv(
    f"{path_data}/rq1_hate_speech.csv.gzip",
    index_col=0,
    sep="\t",
    compression="gzip",
)

# Binary target: 1 = "hate", 0 = "noHate". Any other label value becomes NaN in the
# mapping and the row is removed by dropna(), as are rows with a missing text.
df = (
    df.assign(label=df["label"].map({"hate": 1, "noHate": 0}))
    .loc[:, ["text", "label"]]
    .dropna()
)
print(df.shape)
df.head()

# Hold out 33% of the rows as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    df["text"].values,
    df["label"].values,
    test_size=0.33,
    random_state=42,
)

# Pipeline: TF-IDF features followed by a logistic-regression classifier
pipe = Pipeline(
    steps=[
        (
            "vectorizer",
            TfidfVectorizer(
                lowercase=True,  # convert to lowercase
                stop_words="english",  # remove stopwords
                # purely alphabetic tokens of at least 2 characters
                token_pattern=r"(?u)\b[A-Za-z][A-Za-z]+\b",
            ),
        ),
        (
            "clf",
            # logistic regression; the saga solver supports both l1 and l2 penalties
            LogisticRegression(solver="saga", dual=False, max_iter=10000),
        ),
    ]
)


# Hyperparameters to tune (keys follow the "<step>__<parameter>" convention)
param_grid = {
    "vectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],  # creation of n-grams
    "vectorizer__min_df": [1, 10, 100],  # minimum support for words
    "clf__C": [0.1, 1, 10, 100],  # regularization strength
    "clf__penalty": ["l2", "l1"],  # type of regularization
}

# Exhaustive grid search with cross-validation, using all available cores
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=-1,
    verbose=True,
)

# To speed things up, the hyperparameters are searched on the first 1000 training
# rows only; the model is fitted on the entire training set later.
grid_search.fit(X_train[:1000], y_train[:1000])

# Best parameters and the cross-validated score they reached
print(grid_search.best_params_)
print(grid_search.best_score_)

# Full results table, best configurations first
results = pd.DataFrame(grid_search.cv_results_)
results.sort_values(by="mean_test_score", ascending=False).head(10)

# Copy the best hyperparameters onto the pipeline and fit it on the entire training set
pipe = pipe.set_params(**grid_search.best_params_)
clf_best = pipe.fit(X_train, y_train)

# Vocabulary size (number of n-gram features kept by the vectorizer)
fitted_vectorizer = clf_best["vectorizer"]
print(len(fitted_vectorizer.get_feature_names_out()))

# The full vocabulary can be inspected with:
# clf_best["vectorizer"].vocabulary_

# Score on the training set
print(clf_best.score(X_train, y_train))
# Score on the held-out test set
print(clf_best.score(X_test, y_test))

# Add predictions to the dataframe: the predicted class and the probability of class 1 (hate).
# Note that this covers every row, training rows included.
df["predicted"] = clf_best.predict(df["text"])
df["predicted_prob_hate"] = clf_best.predict_proba(df["text"])[:, 1]
df

# Extract the coefficients from the model: one row per n-gram with its weight.
# Building the frame from a dict keeps "coef" as a float column (stacking the two
# arrays in a list and transposing would turn both columns into object dtype).
coefs = pd.DataFrame(
    {
        "gram": clf_best["vectorizer"].get_feature_names_out(),
        # coef_ has a single row for a binary problem; positive weights push
        # towards class 1 (hate), negative weights towards class 0.
        "coef": clf_best["clf"].coef_[0],
    }
)

# top words influencing hate
display(coefs.sort_values(by="coef", ascending=False).head(10))

# top words influencing non-hate
display(coefs.sort_values(by="coef", ascending=True).head(10))

# Find some extreme examples
# Misclassified rows only: the true label differs from the predicted one.
df_confused = df.loc[df["label"] != df["predicted"]]

# Labelled as not hateful, yet given the highest predicted probability of hate.
# The "text" column is selected by name instead of relying on column position.
pred_hate_not_hate = (
    df_confused.loc[df_confused["label"] == 0]
    .sort_values(by="predicted_prob_hate")["text"]
    .iloc[-1]
)
# Labelled as hateful, yet given the lowest predicted probability of hate.
pred_not_hate_hate = (
    df_confused.loc[df_confused["label"] == 1]
    .sort_values(by="predicted_prob_hate")["text"]
    .iloc[0]
)

# Sort once by predicted probability and read both ends of the ranking.
ranked_texts = df.sort_values(by="predicted_prob_hate")["text"]
less_hate = ranked_texts.iloc[0]
most_hate = ranked_texts.iloc[-1]

# Hand-picked sentence (kept verbatim, including the trailing space).
pred_50_50 = "She says the class is out of control and the kids are unteachable , and the black administration does not support her "

print("Least hate: ", less_hate)
print("Most hate: ", most_hate)
print("Predicted very hate but not hateful: ", pred_hate_not_hate)
print("Predicted very innocuous but hateful: ", pred_not_hate_hate)
print("Predicted 50/50: ", pred_50_50)

# Start the explainer; class_names follow the label encoding (0 = innocuous, 1 = hateful)
explainer = LimeTextExplainer(bow=False, class_names=["Innocuous", "Hateful"])

# Show the explanation for each of the example instances.
# Every iteration overwrites the same HTML file, which is displayed right away.
for text in (less_hate, most_hate, pred_hate_not_hate, pred_not_hate_hate, pred_50_50):
    exp = explainer.explain_instance(
        text,
        classifier_fn=clf_best.predict_proba,
        num_features=10,
        num_samples=1000,
    )
    # NOTE(review): LIME appears to treat `text` as a show/hide flag for the raw
    # text panel, so a non-empty string simply enables it - confirm against the
    # LIME documentation.
    exp.save_to_file("./lime_explainer_1.html", text=text)
    display(HTML(filename="./lime_explainer_1.html"))
    print(exp.as_list())
    print("-" * 100)

# Explain a free-form sentence that is not part of the dataset.
exp = explainer.explain_instance(
    "I believe Dutch people have inferior food and they should be colonized by Belgium",
    clf_best.predict_proba,
    num_features=10,
    num_samples=1000,
)
# `text` is a show/hide flag for the raw-text panel of the HTML report; the text that is
# rendered comes from the explanation itself. Passing True avoids depending on whatever
# value a leftover `text` loop variable from an earlier cell happens to hold.
exp.save_to_file("./lime_explainer_2.html", text=True)
display(HTML(filename="./lime_explainer_2.html"))
print(exp.as_list())
print("-" * 100)

#!pip install scikeras
from scikeras.wrappers import KerasClassifier
from keras._tf_keras.keras.preprocessing.text import Tokenizer
from keras._tf_keras.keras.utils import pad_sequences
from keras._tf_keras.keras.models import Sequential
from keras import layers, utils
import matplotlib.pyplot as plt


def plot_history(history, val=0):
    """Plot accuracy and loss per epoch side by side.

    Args:
        history: Mapping with the per-epoch series "accuracy" and "loss"; when
            validation curves are requested it must also contain "val_accuracy"
            and "val_loss".
        val: Validation curves are drawn only when this compares equal to 1
            (so ``True`` works as well).
    """
    with_validation = val == 1

    acc = history["accuracy"]
    # a validation set can be passed to the fit function of the network
    val_acc = history["val_accuracy"] if with_validation else None
    loss = history["loss"]
    val_loss = history["val_loss"] if with_validation else None
    epochs = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    panels = (
        ("accuracy", "Accuracy", acc, val_acc),
        ("loss", "Loss", loss, val_loss),
    )
    # One subplot per metric: training curve in blue, validation curve in red.
    for position, (metric, title, train_curve, val_curve) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_curve, "b", label=f"Training {metric}")
        if with_validation:
            plt.plot(epochs, val_curve, "r", label=f"Validation {metric}")
        plt.ylabel(metric)
        plt.xlabel("epoch")
        plt.title(title)
        plt.legend()

## CREATE MODEL
def create_model(
    vocab_size,
    num_filters=64,
    kernel_size=3,
    embedding_dim=50,
    maxlen=100,
    num_classes=2,
):
    """Build and compile a small 1D-convolutional text classifier.

    Args:
        vocab_size: Input dimension of the embedding layer.
        num_filters: Number of filters of the convolutional layer.
        kernel_size: Width of the convolution window.
        embedding_dim: Output dimension of the embedding layer.
        maxlen: Sequence length used to build the model (input shape (None, maxlen)).
        num_classes: Number of units in the output layer.

    Returns:
        The compiled and built Sequential model.
    """
    network = Sequential(
        [
            layers.Embedding(vocab_size, embedding_dim),
            layers.Conv1D(num_filters, kernel_size, activation="relu"),
            layers.GlobalMaxPooling1D(),
            layers.Dense(10, activation="relu"),
            # NOTE(review): sigmoid outputs are independent per class, so the
            # per-class scores need not sum to 1.
            layers.Dense(num_classes, activation="sigmoid"),
        ]
    )
    network.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )
    network.build((None, maxlen))
    return network


## CLASS FOR PREPROCESSING (needed to work with pipelines)
class preprocessing:
    """Turn raw texts into fixed-length integer sequences.

    Wraps a Keras Tokenizer behind the fit/transform interface so that it can
    be used as a pipeline step.
    """

    def __init__(self, num_words=20000, maxlen=100):
        # num_words is handed to the Tokenizer; maxlen is the padded sequence length
        self.tokenizer = Tokenizer(num_words=num_words)
        self.maxlen = maxlen

    def fit(self, X, y=None):
        """Fit the tokenizer on the texts in X (y is ignored) and return self."""
        self.tokenizer.fit_on_texts(X)
        return self

    def transform(self, X, y=None):
        """Convert the texts in X to sequences padded at the end to self.maxlen."""
        sequences = self.tokenizer.texts_to_sequences(X)
        return pad_sequences(sequences, maxlen=self.maxlen, padding="post")

## PROCESS DATA
# Same split as before (same seed and test size)
X_train, X_test, y_train, y_test = train_test_split(
    df["text"].values,
    df["label"].values,
    test_size=0.33,
    random_state=42,
)

# One-hot encode the binary hate / no-hate labels
y_train = utils.to_categorical(y_train)
y_test = utils.to_categorical(y_test)

## CREATE PIPELINE
# Preprocessing pipeline: fitted on the training texts only, then applied to both splits
pipe_preproc = Pipeline(steps=[("preproc", preprocessing())])
pipe_preproc.fit(X_train)

X_train_p = pipe_preproc.transform(X_train)
X_test_p = pipe_preproc.transform(X_test)

# +1 on top of the number of distinct words indexed by the tokenizer
vocab_size = len(pipe_preproc["preproc"].tokenizer.word_index) + 1
print(vocab_size)

# Estimator pipeline with a single step: the Keras CNN wrapped as a scikit-learn classifier
pipe_est = Pipeline(
    steps=[
        (
            "clf",
            KerasClassifier(
                model=create_model,
                # arguments handed on to create_model
                vocab_size=vocab_size,
                num_filters=32,
                # training settings
                epochs=10,
                batch_size=64,
                verbose=True,
            ),
        )
    ]
)

# Test that it works on a small sample
pipe_est.fit(X_train_p[:500], y_train[:500])

pipe_est["clf"].model_.summary()

## HYPERPARAMETER TUNING
# Search space; "clf__model__<name>" entries are routed to create_model
param_grid = {
    "clf__model__num_filters": [32, 64, 128],
    "clf__model__kernel_size": [3, 5, 7],
    "clf__model__embedding_dim": [50, 100],
    "clf__verbose": [False],  # silence the training output during the search
}

# Randomized search: 10 sampled configurations, 5-fold cross-validation
grid = RandomizedSearchCV(
    estimator=pipe_est,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    verbose=True,
)

# As before, only the first 1000 training rows are used to keep the search fast
grid.fit(X_train_p[:1000], y_train[:1000])

print(grid.best_score_)
print(grid.best_params_)

# Use the best parameters in the pipe and fit with the entire dataset.
# pipe_est still carries its initial settings, so the winning configuration has to be
# copied onto it before fitting; grid.best_estimator_ itself was only trained on the
# 1000-row sample used for the search.
pipe_est.set_params(**grid.best_params_)
clf_best = pipe_est.fit(X_train_p, y_train, clf__validation_data=(X_test_p, y_test))

# NOTE(review): the test split doubles as validation data here, so the number of
# epochs chosen from this plot is tuned on the test set.
plot_history(clf_best["clf"].history_, val=True)

# Use the best epochs: refit the same (tuned) pipeline with the reduced epoch count
clf_best = pipe_est.fit(
    X_train_p, y_train, clf__validation_data=(X_test_p, y_test), clf__epochs=4
)

# Extreme examples, hard-coded here as literal strings (reproduced verbatim)
less_hate = "- YouTube"
most_hate = "Look what happens when Whites leave black Countries alone to do what they do naturally The blacks in White Countries today should be on their knees thanking Whites for trying to civilize them"
pred_hate_not_hate = "Too many whites think they deserve what negroes dish out because of guilt ."
pred_not_hate_hate = "https://www.stormfront.org/forum/t1020784/ https : //www.stormfront.org/forum/t102 ... ghlight = sweden https : //www.stormfront.org/forum/t102 ... ghlight = sweden https : //www.stormfront.org/forum/t101 ... ghlight = sweden https : //www.stormfront.org/forum/t101 ... ghlight = sweden https : //www.stormfront.org/forum/t100 ... ghlight = sweden https : //www.stormfront.org/forum/t100 ... ghlight = sweden https : //www.stormfront.org/forum/t100 ... ghlight = sweden God save them ....."
pred_50_50 = "She says the class is out of control and the kids are unteachable , and the black administration does not support her "

# Print every example next to its caption
for caption, example in (
    ("Least hate: ", less_hate),
    ("Most hate: ", most_hate),
    ("Predicted very hate but not hate: ", pred_hate_not_hate),
    ("Predicted non hate but hate: ", pred_not_hate_hate),
    ("Predicted 50/50: ", pred_50_50),
):
    print(caption, example)

# Start the explainer; class_names follow the label encoding (0 = innocuous, 1 = hate)
explainer = LimeTextExplainer(bow=False, class_names=["Innocuous", "Hate"])


# relying on global objects (not too nice)
def create_proba(text):
    """Return class probabilities for raw texts.

    The input is run through the fitted global preprocessing pipeline
    (pipe_preproc) and the result is scored with the global classifier
    (clf_best). It is used below as the prediction function given to LIME.
    """
    return clf_best.predict_proba(pipe_preproc.transform(text))


# Show the explanation for each of the example instances.
# Every iteration overwrites the same HTML file, which is displayed right away.
for text in (less_hate, most_hate, pred_hate_not_hate, pred_not_hate_hate, pred_50_50):
    exp = explainer.explain_instance(
        text,
        classifier_fn=create_proba,
        num_features=10,
        num_samples=1000,
    )
    # NOTE(review): LIME appears to treat `text` as a show/hide flag for the raw
    # text panel, so a non-empty string simply enables it - confirm against the
    # LIME documentation.
    exp.save_to_file("./lime_explainer_3.html", text=text)
    display(HTML(filename="./lime_explainer_3.html"))
    print(exp.as_list())
    print("-" * 100)

# Practical 9: Responsible Text Mining & Applications (OPTIONAL)

# Text Mining, Transforming Text into Knowledge (202400006)

# Contents covered in this practical WILL NOT be part of the exam!

# Classification

# Example problem 1: Identification of fake news, hate speech or spam + Interpretability of results:

# Example problem 2: Evaluate the importance of metadata. Create a classification system to identify the movie genre using and excluding metadata:

# Clustering:

# Example problem 3: Create a recommendation system for movies based on their plot:

# Example problem 4: Cluster headlines using word embeddings:

# Example problem 1: Identification of hate speech

# Step 1: Read data and create train-test split

# Step 2: Create pipeline and hyperparameter tuning

# Step 3: Interpretation of results

# Interpretation of coefficients in the linear model

# Interpretation of coefficients using LIME (Local Interpretable Model-Agnostic Explanations)

# Now it's your turn.